df <- st_read('data/CA16_CTs_all.gpkg')
## Reading layer `CA16_CTs_all' from data source `/Users/sn/projects/connectivity-and-lfp/data/CA16_CTs_all.gpkg' using driver `GPKG'
## Simple feature collection with 5608 features and 47 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: -124.6992 ymin: 42.01779 xmax: -52.61941 ymax: 60.00006
## geographic CRS: WGS 84
df$percent_hh_with_children <- 100 * df$hh_with_children / df$hh_total
df$percent_drivers_female <- 100 * df$commute_driver_female / (df$commute_driver_male + df$commute_driver_female)
df$percent_publictransit_female <- 100 * df$commute_publictransit_female / (df$commute_publictransit_male + df$commute_publictransit_female)
df$lfp_gap <- df$lfp_male - df$lfp_female
df <- na.omit(df)
df_no_geom <- st_drop_geometry(df)
descriptives
iv_colnames <- c("pca1_stock", "med_hh_income_1000", "avg_rooms_per_dwelling", "percent_hh_with_children", "percent_drivers_female", "percent_publictransit_female")
lfp <- rbind(data.frame(lfp = df$lfp_female, gender ='Female'), data.frame(lfp = df$lfp_male, gender ='Male'))
ggplot(lfp, aes(x=lfp, fill=gender)) + geom_histogram(alpha=0.5, position="identity") + scale_x_continuous(breaks=seq(0,100,10)) + labs(x='LFP (%)')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

avg_gap <- mean(df$lfp_gap)
ggplot(df_no_geom, aes(lfp_gap)) + geom_histogram(color="black", fill="white", binwidth = 2) + geom_vline(xintercept = avg_gap, color='red') + labs(x='percentage points')

avg_sndi <- mean(df$pca1_stock)
ggplot(df, aes(pca1_stock)) + geom_histogram(color="black", fill="white") + geom_vline(xintercept=avg_sndi, color='red') + labs(x='SNDI')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

st_drop_geometry(df[iv_colnames]) %>% summarise_all(mean) %>% t()
## [,1]
## pca1_stock 2.226660
## med_hh_income_1000 78.232846
## avg_rooms_per_dwelling 6.168332
## percent_hh_with_children 40.896490
## percent_drivers_female 44.342552
## percent_publictransit_female 57.126427
determinants of LFP
df_vars <- df_no_geom[iv_colnames]
df_vars$lfp_female <- df_no_geom$lfp_female
model_all <- lm(lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + percent_hh_with_children + percent_drivers_female + percent_publictransit_female , data=df_vars)
summary(model_all)
##
## Call:
## lm(formula = lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
## percent_hh_with_children + percent_drivers_female + percent_publictransit_female,
## data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.865 -4.210 0.369 4.732 41.482
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.169783 1.061722 43.486 < 2e-16 ***
## pca1_stock -0.492749 0.072938 -6.756 1.57e-11 ***
## med_hh_income_1000 0.176581 0.005692 31.022 < 2e-16 ***
## avg_rooms_per_dwelling -2.767874 0.160250 -17.272 < 2e-16 ***
## percent_hh_with_children 0.071829 0.010555 6.805 1.12e-11 ***
## percent_drivers_female 0.364052 0.023530 15.472 < 2e-16 ***
## percent_publictransit_female 0.017646 0.007318 2.411 0.0159 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.348 on 5418 degrees of freedom
## Multiple R-squared: 0.2095, Adjusted R-squared: 0.2087
## F-statistic: 239.4 on 6 and 5418 DF, p-value: < 2.2e-16
plot(residuals(model_all), ylab='residuals')
abline(h=0, col='red', lw=2)

plot(residuals(model_all) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_all), main=NULL)
qqline(residuals(model_all))

bptest(model_all) # heteroskedasticity
##
## studentized Breusch-Pagan test
##
## data: model_all
## BP = 687.66, df = 6, p-value < 2.2e-16
model_no_sndi <- lm(lfp_female ~ . -pca1_stock, data=df_vars)
anova(model_no_sndi, model_all)
## Analysis of Variance Table
##
## Model 1: lfp_female ~ (pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
## percent_hh_with_children + percent_drivers_female + percent_publictransit_female) -
## pca1_stock
## Model 2: lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
## percent_hh_with_children + percent_drivers_female + percent_publictransit_female
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5419 294965
## 2 5418 292501 1 2463.9 45.639 1.571e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
multilevel model
df_vars$csd_uid <- df$csd_uid
model_csd <- lmer(lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling + percent_hh_with_children + percent_drivers_female + percent_publictransit_female + (1 | csd_uid), data=df_vars)
summary(model_csd)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula:
## lfp_female ~ pca1_stock + med_hh_income_1000 + avg_rooms_per_dwelling +
## percent_hh_with_children + percent_drivers_female + percent_publictransit_female +
## (1 | csd_uid)
## Data: df_vars
##
## REML criterion at convergence: 36633.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -6.9213 -0.5249 0.0599 0.5894 5.9513
##
## Random effects:
## Groups Name Variance Std.Dev.
## csd_uid (Intercept) 13.61 3.689
## Residual 46.70 6.833
## Number of obs: 5425, groups: csd_uid, 390
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 4.809e+01 1.271e+00 5.034e+03 37.825 < 2e-16
## pca1_stock -6.062e-01 7.769e-02 5.055e+03 -7.802 7.33e-15
## med_hh_income_1000 1.822e-01 6.557e-03 4.740e+03 27.789 < 2e-16
## avg_rooms_per_dwelling -3.054e+00 1.760e-01 5.196e+03 -17.359 < 2e-16
## percent_hh_with_children 1.055e-01 1.152e-02 5.346e+03 9.157 < 2e-16
## percent_drivers_female 3.430e-01 2.464e-02 5.405e+03 13.921 < 2e-16
## percent_publictransit_female 2.100e-02 7.270e-03 5.087e+03 2.889 0.00389
##
## (Intercept) ***
## pca1_stock ***
## med_hh_income_1000 ***
## avg_rooms_per_dwelling ***
## percent_hh_with_children ***
## percent_drivers_female ***
## percent_publictransit_female **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) pc1_st m___10 avg___ prc___ prcnt_d_
## pca1_stock 0.102
## md_hh__1000 0.221 0.054
## avg_rms_pr_ -0.287 -0.230 -0.671
## prcnt_hh_w_ -0.037 -0.142 -0.043 -0.445
## prcnt_drvr_ -0.772 -0.065 -0.064 -0.152 0.163
## prcnt_pblc_ -0.295 0.022 0.120 -0.018 -0.093 -0.028
plot(residuals(model_csd), ylab='residuals')

plot(residuals(model_csd) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_csd))
qqline(residuals(model_csd))

LFP gender diff ~ .
df_vars$lfp_gap <- df$lfp_gap
model_gap <- lm(lfp_gap ~ . -lfp_female -csd_uid, data=df_vars)
summary(model_gap)
##
## Call:
## lm(formula = lfp_gap ~ . - lfp_female - csd_uid, data = df_vars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47.333 -2.333 -0.028 2.428 17.017
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.042585 0.570525 38.636 < 2e-16 ***
## pca1_stock 0.314628 0.039194 8.027 1.21e-15 ***
## med_hh_income_1000 0.017064 0.003059 5.579 2.54e-08 ***
## avg_rooms_per_dwelling -0.282646 0.086112 -3.282 0.00104 **
## percent_hh_with_children 0.024156 0.005672 4.259 2.09e-05 ***
## percent_drivers_female -0.305210 0.012644 -24.139 < 2e-16 ***
## percent_publictransit_female -0.026643 0.003933 -6.775 1.38e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.948 on 5418 degrees of freedom
## Multiple R-squared: 0.1459, Adjusted R-squared: 0.1449
## F-statistic: 154.2 on 6 and 5418 DF, p-value: < 2.2e-16
plot(residuals(model_gap), ylab='residuals')

plot(residuals(model_gap) ~ df_vars$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_gap), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_gap))

Comparative by SNDI
df_no_geom %>%
group_by(csd_uid) %>%
summarise(sndi_csd = mean(pca1_stock), pop_csd = sum(population)) %>%
filter(pop_csd > 1000000) %>%
arrange(sndi_csd)
## # A tibble: 3 x 3
## csd_uid sndi_csd pop_csd
## <chr> <dbl> <int>
## 1 2466023 0.464 1702338
## 2 3520005 1.51 2722663
## 3 4806016 2.53 1238541
low_sndi <- filter(df, csd_uid == '2466023')
high_sndi <- filter(df, csd_uid == '4806016')
Low SNDI
ggplot(low_sndi, aes(fill=lfp_female)) + geom_sf() + scale_fill_viridis_b() + labs(fill='LFP (%)') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

ggplot(low_sndi, aes(fill=pca1_stock)) + geom_sf() + scale_fill_viridis_b() + labs(fill='SNDI')+ theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

low_sndi_reg <- st_drop_geometry(low_sndi[iv_colnames])
low_sndi_reg$lfp_female <- low_sndi$lfp_female
model_low <- lm(lfp_female ~ ., data=low_sndi_reg)
summary(model_low)
##
## Call:
## lm(formula = lfp_female ~ ., data = low_sndi_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -33.609 -4.252 0.335 5.195 17.346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.66045 3.96479 11.516 < 2e-16 ***
## pca1_stock -1.99298 0.36715 -5.428 9.24e-08 ***
## med_hh_income_1000 0.14596 0.03271 4.462 1.02e-05 ***
## avg_rooms_per_dwelling 1.34355 0.95630 1.405 0.160714
## percent_hh_with_children -0.33336 0.04929 -6.763 4.12e-11 ***
## percent_drivers_female 0.22963 0.06538 3.512 0.000489 ***
## percent_publictransit_female 0.08154 0.06121 1.332 0.183440
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.845 on 459 degrees of freedom
## Multiple R-squared: 0.3333, Adjusted R-squared: 0.3246
## F-statistic: 38.24 on 6 and 459 DF, p-value: < 2.2e-16
plot(residuals(model_low), ylab='residuals')

plot(residuals(model_low) ~ low_sndi_reg$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_low), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_low))

High SNDI
ggplot(high_sndi, aes(fill=lfp_female)) + geom_sf() + scale_fill_viridis_b() + labs(fill='LFP (%)') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

ggplot(high_sndi, aes(fill=pca1_stock)) + geom_sf() + scale_fill_viridis_b() + labs(fill='SNDI') + theme(axis.text.x = element_blank(), axis.text.y = element_blank(), axis.ticks = element_blank(), rect = element_blank())

high_sndi_reg <- st_drop_geometry(high_sndi[iv_colnames])
high_sndi_reg$lfp_female <- high_sndi$lfp_female
model_high <- lm(lfp_female ~ ., data=high_sndi_reg)
summary(model_high)
##
## Call:
## lm(formula = lfp_female ~ ., data = high_sndi_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.8531 -3.1539 0.0895 3.1920 12.5510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.87653 5.97527 4.498 1.11e-05 ***
## pca1_stock -0.81836 0.32834 -2.492 0.0134 *
## med_hh_income_1000 0.11516 0.01985 5.802 2.27e-08 ***
## avg_rooms_per_dwelling -5.05750 0.51895 -9.746 < 2e-16 ***
## percent_hh_with_children 0.16178 0.03985 4.060 6.83e-05 ***
## percent_drivers_female 0.92715 0.10393 8.921 < 2e-16 ***
## percent_publictransit_female 0.29001 0.05487 5.285 3.02e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.988 on 220 degrees of freedom
## Multiple R-squared: 0.4963, Adjusted R-squared: 0.4825
## F-statistic: 36.12 on 6 and 220 DF, p-value: < 2.2e-16
plot(residuals(model_high), ylab='residuals')

plot(residuals(model_high) ~ high_sndi_reg$lfp_female, ylab='residuals', xlab='fitted value')

qqnorm(residuals(model_high), main='Residual QQ Plot (normal dist)')
qqline(residuals(model_high))

grouped by SNDI?
df_sndi <- data.frame(sndi=df_no_geom$pca1_stock, sndi_level=if_else(df_no_geom$pca1_stock<2.52, '>= avg', '< avg'), lfp=df_no_geom$lfp_female)
ggplot(df_sndi, aes(x=sndi, y=lfp)) + geom_point(alpha=0.2) + geom_smooth(method='lm', se=FALSE, aes(color = sndi_level)) + labs(y='Female LFP (%)', x='SNDI', color='SNDI') +theme_bw()
## `geom_smooth()` using formula 'y ~ x'
